{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "from argparse import Namespace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = Namespace(\n",
    "    raw_train_dataset_csv=\"data/yelp/raw_train.csv\",\n",
    "    raw_test_dataset_csv=\"data/yelp/raw_test.csv\",\n",
    "    train_proportion=0.7,\n",
    "    val_proportion=0.3,\n",
    "    output_munged_csv=\"data/yelp/reviews_with_splits_full.csv\",\n",
    "    seed=1337\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read raw data\n",
    "train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])\n",
    "train_reviews = train_reviews[~pd.isnull(train_reviews.review)]\n",
    "test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'])\n",
    "test_reviews = test_reviews[~pd.isnull(test_reviews.review)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Unfortunately, the frustration of being Dr. Go...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Been going to Dr. Goldberg for over 10 years. ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>I don't know what Dr. Goldberg was like before...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>I'm writing this review to give you a heads up...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>All the food is great here. But the best thing...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rating                                             review\n",
       "0       1  Unfortunately, the frustration of being Dr. Go...\n",
       "1       2  Been going to Dr. Goldberg for over 10 years. ...\n",
       "2       1  I don't know what Dr. Goldberg was like before...\n",
       "3       1  I'm writing this review to give you a heads up...\n",
       "4       2  All the food is great here. But the best thing..."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_reviews.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Ordered a large Mango-Pineapple smoothie. Stay...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Quite a surprise!  \\n\\nMy wife and I loved thi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>First I will say, this is a nice atmosphere an...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>I was overall pretty impressed by this hotel. ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>Video link at bottom review. Worst service I h...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rating                                             review\n",
       "0       1  Ordered a large Mango-Pineapple smoothie. Stay...\n",
       "1       2  Quite a surprise!  \\n\\nMy wife and I loved thi...\n",
       "2       1  First I will say, this is a nice atmosphere an...\n",
       "3       2  I was overall pretty impressed by this hotel. ...\n",
       "4       1  Video link at bottom review. Worst service I h..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_reviews.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{1, 2}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unique classes\n",
    "set(train_reviews.rating)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splitting train by rating\n",
    "# Create dict\n",
    "by_rating = collections.defaultdict(list)\n",
    "for _, row in train_reviews.iterrows():\n",
    "    by_rating[row.rating].append(row.to_dict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create split data\n",
    "final_list = []\n",
    "np.random.seed(args.seed)\n",
    "\n",
    "for _, item_list in sorted(by_rating.items()):\n",
    "\n",
    "    np.random.shuffle(item_list)\n",
    "    \n",
    "    n_total = len(item_list)\n",
    "    n_train = int(args.train_proportion * n_total)\n",
    "    n_val = int(args.val_proportion * n_total)\n",
    "    \n",
    "    # Give data point a split attribute\n",
    "    for item in item_list[:n_train]:\n",
    "        item['split'] = 'train'\n",
    "    \n",
    "    for item in item_list[n_train:n_train+n_val]:\n",
    "        item['split'] = 'val'\n",
    "\n",
    "    # Add to final list\n",
    "    final_list.extend(item_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for _, row in test_reviews.iterrows():\n",
    "    row_dict = row.to_dict()\n",
    "    row_dict['split'] = 'test'\n",
    "    final_list.append(row_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write split data to file\n",
    "final_reviews = pd.DataFrame(final_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train    392000\n",
       "val      168000\n",
       "test      38000\n",
       "Name: split, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews.split.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    The entrance was the #1 impressive thing about...\n",
       "1    I'm a Mclover, and I had no problem\\nwith the ...\n",
       "2    Less than good here, not terrible, but I see n...\n",
       "3    I don't know if I can ever bring myself to go ...\n",
       "4    Food was OK/Good but the service was terrible....\n",
       "Name: review, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews.review.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [rating, review, split]\n",
       "Index: []"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews[pd.isnull(final_reviews.review)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the reviews\n",
    "def preprocess_text(text):\n",
    "    if type(text) == float:\n",
    "        print(text)\n",
    "    text = text.lower()\n",
    "    text = re.sub(r\"([.,!?])\", r\" \\1 \", text)\n",
    "    text = re.sub(r\"[^a-zA-Z.,!?]+\", r\" \", text)\n",
    "    return text\n",
    "    \n",
    "final_reviews.review = final_reviews.review.apply(preprocess_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>negative</td>\n",
       "      <td>the entrance was the impressive thing about th...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>negative</td>\n",
       "      <td>i m a mclover , and i had no problem nwith the...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>negative</td>\n",
       "      <td>less than good here , not terrible , but i see...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>negative</td>\n",
       "      <td>i don t know if i can ever bring myself to go ...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>negative</td>\n",
       "      <td>food was ok good but the service was terrible ...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     rating                                             review  split\n",
       "0  negative  the entrance was the impressive thing about th...  train\n",
       "1  negative  i m a mclover , and i had no problem nwith the...  train\n",
       "2  negative  less than good here , not terrible , but i see...  train\n",
       "3  negative  i don t know if i can ever bring myself to go ...  train\n",
       "4  negative  food was ok good but the service was terrible ...  train"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_reviews.to_csv(args.output_munged_csv, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlpbook",
   "language": "python",
   "name": "nlpbook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "12px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": "5",
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
